* l3-and-l2h-only shadow mode for PAE PV guests that would allow them
* to share l2h pages again.
*
- * GUEST_WALK_TABLES TLB FLUSH COALESCE
- * guest_walk_tables can do up to three remote TLB flushes as it walks to
- * the first l1 of a new pagetable. Should coalesce the flushes to the end,
- * and if we do flush, re-do the walk. If anything has changed, then
- * pause all the other vcpus and do the walk *again*.
- *
* PSE disabled / PSE36
* We don't support any modes other than PSE enabled, PSE36 disabled.
* Neither of those would be hard to change, but we'd need to be able to
return 0;
}
+/* Re-validate a guest walk after the shadow lock has been taken.
+ *
+ * This validation is called with the shadow lock held (asserted below),
+ * and after write permission removal.  The check is then atomic, and no
+ * more inconsistent content can be observed before the lock is released.
+ *
+ * The entries recorded in "gw" are compared, level by level, against the
+ * current contents of the guest tables for "va".  The comparison is
+ * skipped entirely when gtable_dirty_version still equals gw->version.
+ *
+ * Every page mapped here with sh_map_domain_page() is unmapped again
+ * before returning, on both the match and the mismatch paths.
+ *
+ * Return 1 to indicate success and 0 for inconsistency.
+ */
+static inline uint32_t
+shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
+{
+    struct domain *d = v->domain;
+    guest_l1e_t *l1p;
+    guest_l2e_t *l2p;
+#if GUEST_PAGING_LEVELS >= 4
+    guest_l3e_t *l3p;
+    guest_l4e_t *l4p;
+#endif
+    int mismatch;
+
+    ASSERT(shadow_locked_by_me(d));
+
+    /* Version unchanged since the walk sampled it: nothing to re-check. */
+    if ( gw->version ==
+         atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
+        return 1;
+
+    /* We may consider caching the guest page mappings from the last
+     * guest table walk.  However, this check happens relatively
+     * infrequently, and the small cost of remapping the guest pages
+     * here is better than caching the mappings in every guest table
+     * walk.
+     *
+     * Also, when an inconsistency is found, simply return so that
+     * another fault is triggered, instead of re-validating the new
+     * path: this keeps the logic simple.
+     */
+    perfc_incr(shadow_check_gwalk);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
+    if ( gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4 )
+        return 0;
+    /* Read the entry, drop the mapping, and only then act on the result,
+     * so that the mapping is released on the early-return path too. */
+    l3p = sh_map_domain_page(gw->l3mfn);
+    mismatch = (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
+    sh_unmap_domain_page(l3p);
+    if ( mismatch )
+        return 0;
+#else
+    if ( gw->l3e.l3 !=
+         v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3 )
+        return 0;
+#endif
+    l2p = sh_map_domain_page(gw->l2mfn);
+    mismatch = (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
+    sh_unmap_domain_page(l2p);
+    if ( mismatch )
+        return 0;
+#else
+    l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
+    if ( gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2 )
+        return 0;
+#endif
+    /* The l1 entry is compared unless the guest supports superpages and
+     * the recorded l2 entry has _PAGE_PSE set. */
+    if ( !(guest_supports_superpages(v) &&
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
+    {
+        l1p = sh_map_domain_page(gw->l1mfn);
+        mismatch = (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
+        sh_unmap_domain_page(l1p);
+        if ( mismatch )
+            return 0;
+    }
+
+    return 1;
+}
+
+/* Drop write access, in one batch, to the guest pagetable pages recorded
+ * in a walk_t: the l3 table (4-level guests only), the l2 table (PAE and
+ * 4-level guests), and the l1 table.  The l1 level is skipped when the
+ * guest supports superpages and the l2 entry has _PAGE_PSE set.
+ *
+ * Returns the OR of the individual sh_remove_write_access() results, to
+ * be used as a TLB flush hint.
+ */
+static inline uint32_t
+gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
+{
+    int flush_hint = 0;
+
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    flush_hint |= sh_remove_write_access(v, gw->l3mfn, 3, va);
+#endif
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+    flush_hint |= sh_remove_write_access(v, gw->l2mfn, 2, va);
+#endif
+
+    if ( !guest_supports_superpages(v) ||
+         !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE) )
+        flush_hint |= sh_remove_write_access(v, gw->l1mfn, 1, va);
+
+    return flush_hint;
+}
+
/* Walk the guest pagetables, after the manner of a hardware walker.
*
* Inputs: a vcpu, a virtual address, a walk_t to fill, a
- * pointer to a pagefault code, and a flag "shadow_op".
+ * pointer to a pagefault code
*
* We walk the vcpu's guest pagetables, filling the walk_t with what we
* see and adding any Accessed and Dirty bits that are needed in the
* we go. For the purposes of reading pagetables we treat all non-RAM
* memory as contining zeroes.
*
- * If "shadow_op" is non-zero, we are serving a genuine guest memory access,
- * and must (a) be under the shadow lock, and (b) remove write access
- * from any guest PT pages we see, as we will be shadowing them soon
- * and will rely on the contents' not having changed.
+ * The walk is done in a lock-free style, with some sanity checks postponed
+ * until after the shadow lock has been taken. Those delayed checks make
+ * sure that no inconsistent mapping is translated into the shadow page table.
*
* Returns 0 for success, or the set of permission bits that we failed on
* if the walk did not complete.
* checked the old return code anyway.
*/
static uint32_t
-guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw,
- uint32_t pfec, int shadow_op)
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
{
struct domain *d = v->domain;
p2m_type_t p2mt;
uint32_t gflags, mflags, rc = 0;
int pse;
- ASSERT(!shadow_op || shadow_locked_by_me(d));
-
perfc_incr(shadow_guest_walk);
memset(gw, 0, sizeof(*gw));
gw->va = va;
+ gw->version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
+ rmb();
+
/* Mandatory bits that must be set in every entry. We invert NX, to
* calculate as if there were an "X" bit that allowed access.
* We will accumulate, in rc, the set of flags that are missing. */
goto out;
}
ASSERT(mfn_valid(gw->l3mfn));
- /* This mfn is a pagetable: make sure the guest can't write to it. */
- if ( shadow_op && sh_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
- flush_tlb_mask(d->domain_dirty_cpumask);
+
/* Get the l3e and check its flags*/
l3p = sh_map_domain_page(gw->l3mfn);
gw->l3e = l3p[guest_l3_table_offset(va)];
goto out;
}
ASSERT(mfn_valid(gw->l2mfn));
- /* This mfn is a pagetable: make sure the guest can't write to it. */
- if ( shadow_op && sh_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
- flush_tlb_mask(d->domain_dirty_cpumask);
+
/* Get the l2e */
l2p = sh_map_domain_page(gw->l2mfn);
gw->l2e = l2p[guest_l2_table_offset(va)];
goto out;
}
ASSERT(mfn_valid(gw->l1mfn));
- /* This mfn is a pagetable: make sure the guest can't write to it. */
- if ( shadow_op
- && sh_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
- flush_tlb_mask(d->domain_dirty_cpumask);
l1p = sh_map_domain_page(gw->l1mfn);
gw->l1e = l1p[guest_l1_table_offset(va)];
gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
// XXX -- this is expensive, but it's easy to cobble together...
// FIXME!
- shadow_lock(v->domain);
- if ( guest_walk_tables(v, addr, &gw, PFEC_page_present, 1) == 0
+ if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0
&& mfn_valid(gw.l1mfn) )
{
if ( gl1mfn )
(guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
}
- shadow_unlock(v->domain);
-
return pl1e;
}
// XXX -- this is expensive, but it's easy to cobble together...
// FIXME!
- shadow_lock(v->domain);
- (void) guest_walk_tables(v, addr, &gw, PFEC_page_present, 1);
+ (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
*(guest_l1e_t *)eff_l1e = gw.l1e;
- shadow_unlock(v->domain);
}
#endif /* CONFIG==SHADOW==GUEST */
return 0;
}
- shadow_lock(d);
-
- shadow_audit_tables(v);
-
- if ( guest_walk_tables(v, va, &gw, regs->error_code, 1) != 0 )
+ if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
{
perfc_incr(shadow_fault_bail_real_fault);
- goto not_a_shadow_fault;
+ SHADOW_PRINTK("not a shadow fault\n");
+ reset_early_unshadow(v);
+ return 0;
}
/* It's possible that the guest has put pagetables in memory that it has
if ( unlikely(d->is_shutting_down) )
{
SHADOW_PRINTK("guest is shutting down\n");
- shadow_unlock(d);
return 0;
}
- sh_audit_gw(v, &gw);
-
/* What kind of access are we dealing with? */
ft = ((regs->error_code & PFEC_write_access)
? ft_demand_write : ft_demand_read);
perfc_incr(shadow_fault_bail_bad_gfn);
SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
gfn_x(gfn), mfn_x(gmfn));
- goto not_a_shadow_fault;
+ reset_early_unshadow(v);
+ return 0;
}
#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
regs->error_code | PFEC_page_present);
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
+ shadow_lock(d);
+ shadow_audit_tables(v);
+ sh_audit_gw(v, &gw);
+
+ if ( gw_remove_write_accesses(v, va, &gw) )
+ {
+ /* Write permission removal is also a hint that other gwalks
+ * overlapping with this one may be inconsistent
+ */
+ perfc_incr(shadow_rm_write_flush_tlb);
+ atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
+ flush_tlb_mask(d->domain_dirty_cpumask);
+ }
+
+ if ( !shadow_check_gwalk(v, va, &gw) )
+ {
+ perfc_incr(shadow_inconsistent_gwalk);
+ shadow_unlock(d);
+ return EXCRET_fault_fixed;
+ }
+
/* Make sure there is enough free shadow memory to build a chain of
* shadow tables. (We never allocate a top-level shadow on this path,
* only a 32b l1, pae l1, or 64b l3+2+1. Note that while
return vtlb_gfn;
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
- if ( guest_walk_tables(v, va, &gw, pfec[0], 0) != 0 )
+ if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
{
if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
pfec[0] &= ~PFEC_page_present;
}
else
sh_unmap_domain_page(addr);
+
+ atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
}
int